博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
python写的有声小说爬虫
阅读量:5034 次
发布时间:2019-06-12

本文共 5217 字,大约阅读时间需要 17 分钟。

querybook.py

# querybook.py -- crawls the audiobook category listings of www.ting89.com and
# hands each discovered book title to splider.YsSpider for download.
# NOTE(review): recovered from a blog post whose HTML sanitizer collapsed all
# newlines; formatting below is reconstructed from the flattened source.

import requests
from bs4 import BeautifulSoup

import splider


class QuName:
    """Walks the site's category list pages and triggers a download per book."""

    def __init__(self, number):
        # number: count of category ids to walk; ids 1 .. number-1 are visited
        # (the original used range(1, number) -- kept as-is).
        self.number = number

    def getPageNum(self, url):
        """Return the page count parsed from the pagination widget of *url*.

        Returns 0 when the widget is missing or unparseable, so callers can
        safely feed the result to range(). (The original bare ``except:``
        swallowed everything and fell through returning None, which crashed
        getBookList at ``range(2, None)``.)
        """
        f = requests.get(url)
        soup = BeautifulSoup(f.content, "lxml")
        try:
            pageNum = soup.find('div', class_="pagesnums").find('span').text
            print('getPageNum执行成功')
            # NOTE(review): assumes the span text carries the total page count
            # at character positions 3:5 -- fragile; confirm against the site.
            return int(pageNum[3:5])
        except (AttributeError, ValueError, IndexError) as e:
            print('getPageNum执行失败', e)
            return 0
        finally:
            print('___________________________')

    def getBookList(self):
        """Visit every page of every category and scrape its book entries."""
        for num in range(1, self.number):
            base = 'http://www.ting89.com/booklist/' + str(num) + '.html'
            pageNum = self.getPageNum(base)
            self.getBookInfo(base)
            print(base)
            # Pages after the first use the "<category>_<page>.html" scheme.
            for num1 in range(2, pageNum):
                page_url = ('http://www.ting89.com/booklist/'
                            + str(num) + '_' + str(num1) + '.html')
                self.getBookInfo(page_url)
                print(page_url)

    def getBookInfo(self, url):
        """Scrape one listing page and download every book found on it."""
        f = requests.get(url)
        soup = BeautifulSoup(f.content, "lxml")
        try:
            bookList = soup.find('div', class_="clist").findAll('li')
            for i in bookList:
                imgUrl = i.find('img')
                print('书籍封面', imgUrl['src'])
                for j in i.findAll('p'):
                    print(j.text)
                # Download every chapter of this book, keyed by its title.
                splider.YsSpider(i.find('b').text).download_files()
        except AttributeError as e:
            # A page without the expected "clist" markup is skipped, not fatal.
            print('getBookInfo执行失败', e)
        finally:
            print('___________________________')


if __name__ == "__main__":
    # Guarded so importing this module no longer starts a full site crawl.
    qn = QuName(13)  # 13 = number of site categories (hard-coded in original)
    qn.getBookList()

splider.py

# splider.py -- searches www.ting89.com for an audiobook by name and downloads
# every chapter as an .mp3 file into ./<book_name>/.
# NOTE(review): recovered from a blog post whose HTML sanitizer collapsed all
# newlines and stripped the HTML-matching regex literals; see the hedged
# pattern reconstructions in __init__.

import os
import re
import time
import urllib.parse

import requests


class YsSpider:
    """Downloads all chapters of one audiobook from www.ting89.com."""

    def __init__(self, name):
        self.search_name = name
        self.search_url = "http://www.ting89.com/search.asp?searchword="
        self.home_url = "http://www.ting89.com/books/"
        # NOTE(review): the original two patterns contained literal HTML tags
        # and were erased by the blog's sanitizer (they appeared as empty
        # r"""..."""). The reconstructions below are inferred only from how
        # their capture groups are used (index: (book_id, book_name);
        # chapter: (href, title)) -- TODO confirm against the live markup.
        self.index_pattern = r'<a href="/books/(\d+)\.html"[^>]*>([^<]+)</a>'
        self.chapter_pattern = r'<a href="(/down/[^"]+)"[^>]*>(.+?)</a>'
        self.down_pattern = r"""url=(.*)/(.+?)\.mp3"""
        self.book_id = ''
        self.book_name = ''
        self.Chapter_list = []  # chapter titles, filled by _getAllUrl

    def searchbook(self):
        """Search the site for self.search_name and return the chosen book id.

        Exits the process when nothing is found. BUG fixed from the original:
        the "not found" branch was attached to the for loop (for/else) and the
        loop returned on its first iteration, so only one result was ever
        printed and an empty result silently returned None.
        """
        resp = requests.get(
            self.search_url
            + urllib.parse.quote(self.search_name, encoding='gb2312'))
        data = resp.content.decode('gbk')
        result = re.findall(self.index_pattern, data)
        if not result:
            print('*******没有找到你输入的相关书籍,请更换后重新运行程序*******')
            exit()
        for index, item in enumerate(result):
            print('%d.%s' % (index + 1, item[1]))
        # Interactive choice was hard-wired to '1' in the original; kept.
        # choice = input("输入你要下载的书目名称序号: ")
        choice = '1'
        self.book_name = result[int(choice) - 1][1]
        self.book_id = result[int(choice) - 1][0]
        return self.book_id

    def get_chapter_list(self):
        """Return (href, title) tuples for every chapter of the chosen book."""
        data = requests.get(
            self.home_url + self.searchbook() + '.html').content.decode('gbk')
        return re.findall(self.chapter_pattern, data)

    def _getAllUrl(self):
        """Resolve the full mp3 download URL for every chapter."""
        chapter_list = self.get_chapter_list()
        chapter = [x[0] for x in chapter_list]
        self.Chapter_list = [x[1] for x in chapter_list]
        _list = [x[1] for x in chapter_list]
        data = requests.get(
            "http://www.ting89.com" + chapter[0]).content.decode('gbk')
        result = re.findall(self.down_pattern, data)
        # If the first file name looks like a zero-padded sequence ("0...1"),
        # files on the server are numbered; otherwise they are title-named.
        return self.sub_get_url(result[0][0], _list,
                                re.search("^0.*1$", result[0][1]))

    def sub_get_url(self, down_url, _list, down_url_flag):
        """Build the percent-encoded mp3 URL for each chapter.

        down_url: base URL of the media directory; _list: chapter titles;
        down_url_flag: truthy when server files are sequentially numbered.
        """
        url = []
        if down_url_flag:
            # Pad the 1-based index to the width of the largest 0-based index
            # (matches the original: len(str(len-1))) -- TODO confirm this is
            # the server's actual padding rule.
            width = len(str(len(_list) - 1))
            for i in range(len(_list)):
                tmp_url = down_url + '/' + str(i + 1).zfill(width) + '.mp3'
                url.append(urllib.parse.quote(tmp_url, safe='/:?='))
        else:
            for item in _list:
                tmp_url = down_url + '/' + item + ".mp3"
                url.append(urllib.parse.quote(tmp_url, safe='/:?='))
        return url

    def save_a_file(self, url, path, chapter, max_retries=3):
        """Download *url* to *path*, skipping files that already exist.

        Retries up to max_retries times on network/disk failure. The original
        retried via unbounded recursion with a bare ``except:`` (stack
        overflow on a persistently failing URL); also ``f.close`` without
        parentheses was a no-op -- the ``with`` block closes the file.
        """
        for _ in range(max_retries):
            try:
                print('尝试下载', chapter)
                if os.path.exists(path):
                    print('文件已经存在')
                    return
                response = requests.get(url)
                with open(path, 'wb') as f:
                    f.write(response.content)
                print(chapter, '保存成功')
                response.close()
                time.sleep(1)  # be polite to the server between downloads
                return
            except (requests.RequestException, OSError):
                print('爬取失败,已下载至', chapter, '即将重新尝试下载')

    def download_files(self):
        """Download every chapter of the book into ./<book_name>/."""
        urls = self._getAllUrl()
        root = os.path.join(os.getcwd(), self.book_name)
        if not os.path.exists(root):
            os.mkdir(root)
        for index, u in enumerate(urls):
            path = os.path.join(root, self.Chapter_list[index]) + '.mp3'
            self.save_a_file(u, path, self.Chapter_list[index])

转载于:https://www.cnblogs.com/zddzz/p/11340433.html

你可能感兴趣的文章
hibernate生成表时,有的表可以生成,有的却不可以 2014-03-21 21:28 244人阅读 ...
查看>>
mysql-1045(28000)错误
查看>>
Ubuntu 编译出现 ISO C++ 2011 不支持的解决办法
查看>>
1.jstl c 标签实现判断功能
查看>>
Linux 常用命令——cat, tac, nl, more, less, head, tail, od
查看>>
超详细的Guava RateLimiter限流原理解析
查看>>
VueJS ElementUI el-table 的 formatter 和 scope template 不能同时存在
查看>>
Halcon一日一练:图像拼接技术
查看>>
Swift - RotateView
查看>>
iOS设计模式 - 中介者
查看>>
centos jdk 下载
查看>>
HDU 1028 Ignatius and the Princess III(母函数)
查看>>
(转)面向对象最核心的机制——动态绑定(多态)
查看>>
token简单的使用流程。
查看>>
django创建项目流程
查看>>
UIActionSheet 修改字体颜色
查看>>
Vue 框架-01- 入门篇 图文教程
查看>>
Spring注解之@Lazy注解,源码分析和总结
查看>>
多变量微积分笔记24——空间线积分
查看>>
Magento CE使用Redis的配置过程
查看>>